notebook.community

Edit and run



In [1]:

    
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import math
from sklearn.cross_validation import train_test_split,KFold
import numpy as np



In [2]:

    
datafile_train=r'Data/consumer/Consumer_Complaints_train.csv'
datafile_test=r'Data/consumer/Consumer_Complaints_test.csv'
cd_train=pd.read_csv(datafile_train)
cd_test=pd.read_csv(datafile_test)



In [ ]:

    
cd_train.dtypes



In [3]:

    
for col in ['Date received','Date sent to company']:
    cd_train[col]=pd.to_datetime(cd_train[col],infer_datetime_format=True)
    cd_test[col]=pd.to_datetime(cd_test[col],infer_datetime_format=True)



In [4]:

    
cd_train['day_diff']=pd.to_numeric(cd_train['Date sent to company']-cd_train['Date received'])
cd_test['day_diff']=pd.to_numeric(cd_test['Date sent to company']-cd_test['Date received'])



In [5]:

    
for col in ['Date received','Date sent to company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)



In [ ]:

    
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())



In [ ]:

    
cd_train.isnull().sum()



In [ ]:

    
len(pd.isnull(cd_train['Tags']))
len(cd_train)



In [6]:

    
for col in ['Sub-product','Sub-issue','Consumer complaint narrative',
            'Company public response','Tags','Consumer consent provided?']:
    varname=col.replace('-','_').replace('?','').replace(" ",'_')+'_isNan'
    cd_train[varname]=np.where(pd.isnull(cd_train[col]),1,0)
    cd_train.drop([col],1,inplace=True)
    cd_test[varname]=np.where(pd.isnull(cd_test[col]),1,0)
    cd_test.drop([col],1,inplace=True)



In [ ]:

    
cd_train.head(4)



In [15]:

    
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())



In [7]:

    
for col in ['ZIP code','Company']:
    cd_train.drop([col],1,inplace=True)
    cd_test.drop([col],1,inplace=True)



In [8]:

    
cd_train['Consumer disputed?']=np.where(cd_train['Consumer disputed?']=="Yes",1,0)



In [9]:

    
k=cd_train['Issue'].value_counts()
for val in k.axes[0][0:10]:
    varname='Issue_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['Issue']==val,1,0)
    cd_test[varname]=np.where(cd_test['Issue']==val,1,0)
del cd_train['Issue']
del cd_test['Issue']



In [13]:

    
for col in cd_train.select_dtypes(['object']).columns:
    print(col,':',cd_train[col].nunique())



In [10]:

    
k=cd_train['State'].value_counts()
for val in k.axes[0][0:15]:
    varname='State_'+val.replace(',','_').replace(' ','_')
    cd_train[varname]=np.where(cd_train['State']==val,1,0)
    cd_test[varname]=np.where(cd_test['State']==val,1,0)
del cd_train['State']
del cd_test['State']



In [11]:

    
for col in ['Product','Submitted via','Company response to consumer','Timely response?']:
    
    temp=pd.get_dummies(cd_train[col],prefix=col,drop_first=True)
    cd_train=pd.concat([temp,cd_train],1)
    cd_train.drop([col],1,inplace=True)
    
    temp=pd.get_dummies(cd_test[col],prefix=col,drop_first=True)
    cd_test=pd.concat([temp,cd_test],1)
    cd_test.drop([col],1,inplace=True)



In [12]:

    
x = cd_train.drop(['Consumer disputed?','Complaint ID'],1)
y = cd_train['Consumer disputed?']



In [17]:

    
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score,accuracy_score,classification_report

Optimizing model...

Run train_test splits on the train data



In [14]:

    
ld_train, ld_test = train_test_split(cd_train, test_size=0.2, random_state=2)



In [15]:

    
x80_train = ld_train.drop(['Consumer disputed?','Complaint ID'],1)
y80_train = ld_train['Consumer disputed?']

x20_test = ld_test.drop(['Consumer disputed?','Complaint ID'],1)
y20_test = ld_test['Consumer disputed?']

1. Check ROC_AUC_SCORE {penalty='l1', class_weight=None}



In [19]:

    
model_logr1 = LogisticRegression(penalty="l1",class_weight=None,random_state=2)



In [20]:

    
model_logr1.fit(x80_train, y80_train)









    Out[20]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l1', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [21]:

    
#y20_test_pred = np.where(model_logr1.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
y20_test_pred = np.where(model_logr1.predict(x20_test)==1,1,0)
temp_df = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred))), columns=['Complaint ID','Consumer disputed?'])

y_test_pred = temp_df['Consumer disputed?']



In [23]:

    
roc_auc_score(y20_test, y_test_pred)









    Out[23]:





0.5

2. Check ROC_AUC_SCORE {penalty='l2', class_weight=None}



In [24]:

    
model_logrl2 = LogisticRegression(penalty="l2",class_weight=None,random_state=2)



In [25]:

    
model_logrl2.fit(x80_train, y80_train)









    Out[25]:





LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=2, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)



In [26]:

    
y20_test_pred = np.where(model_logrl2.predict(x20_test)==1,1,0)
temp_df = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred))), columns=['Complaint ID','Consumer disputed?'])

y_test_pred = temp_df['Consumer disputed?']



In [27]:

    
roc_auc_score(y20_test, y_test_pred)









    Out[27]:





0.4996866817311203

3. Check ROC_AUC_SCORE {penalty='l1', class_weight='balanced'}



In [28]:

    
model_logr2 = LogisticRegression(penalty="l1",class_weight="balanced",random_state=2)



In [29]:

    
model_logr2.fit(x80_train, y80_train)









    Out[29]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)



In [30]:

    
y20_test_pred2 = np.where(model_logr2.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
temp_df2 = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred2))),
                       columns=['Complaint ID','Consumer disputed?'])

y_test_pred2 = temp_df2['Consumer disputed?']



In [31]:

    
roc_auc_score(y20_test, y_test_pred2)









    Out[31]:





0.5775594371000643

4. Check ROC_AUC_SCORE {penalty='l2', class_weight='balanced'}



In [32]:

    
model_logr3 = LogisticRegression(penalty="l2",class_weight="balanced",random_state=2)



In [33]:

    
model_logr3.fit(x80_train, y80_train)









    Out[33]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l2', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)



In [34]:

    
y20_test_pred3 = np.where(model_logr3.predict(ld_test.drop(['Complaint ID','Consumer disputed?'],1))==1,1,0)
temp_df3 = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(y20_test_pred3))),
                       columns=['Complaint ID','Consumer disputed?'])

y_test_pred3 = temp_df3['Consumer disputed?']



In [35]:

    
roc_auc_score(y20_test, y_test_pred3)









    Out[35]:





0.52907311874384377

2. Optimizing Model continues...

a. Employ CV procedure



In [36]:

    
from sklearn import cross_validation



In [38]:

    
predicted = cross_validation.cross_val_predict(model_logr2, x, y, cv=10)
print(accuracy_score(y, predicted))
print(classification_report(y, predicted))









    



0.568003510755
             precision    recall  f1-score   support

          0       0.84      0.56      0.67    249426
          1       0.27      0.60      0.37     67315

avg / total       0.72      0.57      0.61    316741

3. Cutoff based predicted probabilities



In [55]:

    
prob_score=pd.Series(list(zip(*model_logr2.predict_proba(x80_train)))[1])



In [56]:

    
cutoffs=np.linspace(0,1,100)

For each of these cutoff , we are going to look at TP,FP,TN,FN values and caluclate KS. Then we'll choose the best cutoff as the one having highest KS.



In [58]:

    
KS_cut=[]
for cutoff in cutoffs:
    predicted=pd.Series([0]*len(y80_train))
    predicted[prob_score>cutoff]=1
    df=pd.DataFrame(list(zip(y80_train,predicted)),columns=["real","predicted"])
    TP=len(df[(df["real"]==1) &(df["predicted"]==1) ])
    FP=len(df[(df["real"]==0) &(df["predicted"]==1) ])
    TN=len(df[(df["real"]==0) &(df["predicted"]==0) ])
    FN=len(df[(df["real"]==1) &(df["predicted"]==0) ])
    P=TP+FN
    N=TN+FP
    KS=(TP/P)-(FP/N)
    KS_cut.append(KS)

cutoff_data=pd.DataFrame(list(zip(cutoffs,KS_cut)),columns=["cutoff","KS"])

KS_cutoff=cutoff_data[cutoff_data["KS"]==cutoff_data["KS"].max()]["cutoff"]

Now we'll see how this model with the cutoff determined here , performs on the test data.



In [60]:

    
# Performance on test data
prob_score_test=pd.Series(list(zip(*model_logr2.predict_proba(x20_test)))[1])

predicted_test=pd.Series([0]*len(y20_test))
predicted_test[prob_score_test > float(KS_cutoff)]=1

df_test=pd.DataFrame(list(zip(y20_test,predicted_test)),columns=["real","predicted"])

k=pd.crosstab(df_test['real'],df_test["predicted"])
print('confusion matrix :\n \n ',k)
TN=k.iloc[0,0]
TP=k.iloc[1,1]
FP=k.iloc[0,1]
FN=k.iloc[1,0]
P=TP+FN
N=TN+FP









    



confusion matrix :
 
  predicted      0      1
real                   
0          24178  25699
1           4349   9123



In [61]:

    
# Accuracy of test
(TP+TN)/(P+N)









    Out[61]:





0.52567522770683039



In [62]:

    
# Sensitivity on test
TP/P









    Out[62]:





0.67718230403800472



In [63]:

    
#Specificity on test
TN/N









    Out[63]:





0.48475249112817531

Fit the optimized model on actual x,y and predict y from test dataset



In [39]:

    
model_logr2.fit(x,y)









    Out[39]:





LogisticRegression(C=1.0, class_weight='balanced', dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=2,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)



In [40]:

    
prediction = np.where(model_logr2.predict(cd_test.drop(['Complaint ID'],1))==1,"Yes","No")
submission = pd.DataFrame(list(zip(cd_test['Complaint ID'],list(prediction))),
                       columns=['Complaint ID','Consumer disputed?'])



In [49]:

    
pred_y = submission['Consumer disputed?']
actual_y = cd_train['Consumer disputed?']
# roc_auc_score(actual_y, pred_y) # This will fail since the probability pairs are one-one between y_actual and y_predicted



In [52]:

    
submission.head(4)









    Out[52]:






  
    
      
      Complaint ID
      Consumer disputed?
    
  
  
    
      0
      675956
      Yes
    
    
      1
      1858795
      No
    
    
      2
      32637
      Yes
    
    
      3
      1731374
      No



In [53]:

    
submission.to_csv('submission_new.csv',index=False)

This submission will get you auc score of approx 0.50, slightly less, try to increase the score.